# Dataset selection: which city/month snapshot of the cleaned listings to load.
city = 'madrid'
month = '201909'
filename_in = f'src/data/{city}-{month}-listings-CLEAN.csv'
# Models included in the comparison run.
enabled_models = [
    'LINEAL', 'RIDGE', 'LASSO', 'SGD',
    'R.FOREST', 'ADABOOST', 'XGBOOST', 'CATBOOST',
]
import math
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from IPython.display import display, HTML
# from statsmodels.stats.outliers_influence import variance_inflation_factor
import catboost as cb
from sklearn.svm import NuSVR
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor
from xgboost import XGBRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression, Ridge, Lasso, SGDRegressor
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score, cross_val_predict
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
import plotly.express as px
import chart_studio.plotly as py
import plotly.graph_objs as go
from plotly.offline import iplot, init_notebook_mode
import shap
%run src/utils.py
# Accumulators shared across the model cells below: per-method coefficient
# tables and per-method evaluation metrics.
coefs = {}
metrics = {}


def collect_results(columns, model, method, r2_mean_cv, r2_std_cv, r2, mae, mse, skip_coef=True):
    """Record and display the coefficients (optional) and metrics of one model.

    Parameters
    ----------
    columns : pandas Index of feature names, aligned with ``model.coef_``.
    method : str label under which results are stored in `coefs`/`metrics`.
    r2_mean_cv, r2_std_cv, r2, mae, mse : numeric scores (must support .round).
    skip_coef : when True (the default) only metrics are recorded; pass False
        for linear models whose ``coef_`` should be tabulated.
    """
    # coefs
    if not skip_coef:
        method_coefs = {}
        # BUG FIX: the original tested hasattr(model, '__intercept'), which no
        # estimator ever has, so the intercept was silently never recorded.
        # The fitted sklearn attribute is `intercept_`; the table key stays
        # '__intercept' so existing consumers of `coefs` are unaffected.
        if hasattr(model, 'intercept_'):
            method_coefs['__intercept'] = model.intercept_
        # Magnitudes only: the table ranks features by |coefficient|.
        for i, name in enumerate(columns.values):
            method_coefs[name] = abs(model.coef_[i])
        coefs[method] = method_coefs
        df_coefs = pd.DataFrame(coefs)
        df_coefs = df_coefs.sort_values(by=method, ascending=False)
        display(df_coefs)
    # metrics
    metrics[method] = {
        'R2 MEAN CV': r2_mean_cv.round(3),
        'R2 STD CV': r2_std_cv.round(3),
        'R2': r2.round(3),
        'MAE': mae.round(3),
        'MSE': mse.round(3)
    }
    display(pd.DataFrame(metrics))
def print_feature_importances(method, importances, df):
    """Render a horizontal bar chart ranking `df`'s features by importance."""
    ranking = pd.DataFrame(
        list(zip(df.dtypes.index, importances)),
        columns=['Feature', 'Score']
    )
    ranking = ranking.sort_values(by='Score',
                                  ascending=True,
                                  inplace=False,
                                  kind='quicksort',
                                  na_position='last')
    # One horizontal bar per feature; chart height scales with feature count.
    bars = go.Bar(
        x=ranking['Score'],
        y=ranking['Feature'],
        orientation='h'
    )
    fig = go.Figure(bars)
    fig.update_layout(
        title=method + " Feature Importance Ranking",
        height=25 * len(ranking)
    )
    fig.show()
# Load the cleaned listings snapshot and inspect dtypes / non-null counts.
df = pd.read_csv(filename_in)
df.info()
# Reference list of the predictors intended for modelling. NOTE(review): this
# list is never applied directly; the actual selection below works by
# *dropping* `useless_cols` and `highly_corr_cols` from df.
useful_cols = [
'accommodates',
'bathrooms',
'bedrooms',
'cancellation_policy',
'cleaning_fee',
'extra_people',
'guests_included',
'has_air_conditioning',
'has_bed_linens',
'has_coffee_maker',
'has_cooking_basics',
'has_dishes_and_silverware',
'has_elevator',
'has_essentials',
'has_family/kid_friendly',
'has_first_aid_kit',
'has_hair_dryer',
'has_hangers',
'has_heating',
'has_hot_water',
'has_iron',
'has_kitchen',
'has_laptop_friendly_workspace',
'has_license',
'has_long_term_stays_allowed',
'has_microwave',
'has_no_stairs_or_steps_to_enter',
'has_oven',
'has_refrigerator',
'has_shampoo',
'has_stove',
'has_tv',
'has_washer',
'has_wifi',
'instant_bookable',
'latitude',
'longitude',
'maximum_nights_avg_ntm',
'minimum_nights_avg_ntm',
'neighbourhood',
'price',
'property_type',
'room_type',
'security_deposit'
]
# Columns excluded from modelling (review/host metadata and derived stats).
useless_cols = [
'district',
'neighbourhood',
'income_med_occupation',
'price_med_occupation_per_accommodate',
'activity_months',
'host_response_time',
'first_review',
'last_review',
'number_of_reviews',
'number_of_reviews_ltm',
'review_scores_rating',
'review_scores_accuracy',
'review_scores_cleanliness',
'review_scores_checkin',
'review_scores_communication',
'review_scores_location',
'review_scores_value',
'reviews_per_month'
]
# Columns dropped for being highly correlated with others.
highly_corr_cols = [
'host_verified_by_selfie'
]
# Drop the excluded columns; errors='ignore' tolerates names already absent.
df.drop([*useless_cols, *highly_corr_cols], axis=1, errors='ignore', inplace=True)
df.shape  # no-op in a plain script; displays the shape when run as a notebook cell
print(df.shape)
# One-hot encode the remaining categorical columns (property_type, room_type,
# cancellation_policy; neighbourhood was dropped above).
dfd = pd.get_dummies(df)
print(dfd.shape)
# Target variable and the remaining feature set.
target = 'price'
features = list(dfd.columns)
features.remove(target)
"""
vif = pd.DataFrame()
vif['vif'] = [variance_inflation_factor(dfd[features].values, i) for i in range(dfd[features].shape[1])]
vif['feature'] = dfd[features].columns
vif_results = vif.round(1).sort_values(by='vif', ascending=False)
"""
"""
collineal_features = vif_results[vif_results['vif'] > 20]['feature'].values
print('Collineal features: ', sorted(collineal_features))
df.drop(collineal_features, axis=1, inplace=True)
"""
"""
features = list(dfd.columns)
features.remove(target)
dfd.shape
"""
# Hold out 30% of the one-hot-encoded data for final evaluation.
x_train, x_test, y_train, y_test = train_test_split(
    dfd[features], dfd[target], test_size=0.3, random_state=42
)
# Cast features to float up front to avoid sklearn dtype-conversion warnings.
x_train = x_train.astype(float)
# Regularization strengths explored by the SGD grid search below.
alphas = np.array([1e5, 1e4, 1e3, 1e2, 1, 0,
                   1e-1, 1e-2, 1e-3, 1e-4, 1e-5, 1e-6])
method = 'LINEAL'
if method in enabled_models:
    # Plain OLS baseline. FIX: the original passed normalize=True, a parameter
    # deprecated in scikit-learn 1.0 and removed in 1.2 (it now raises a
    # TypeError). The pipeline's StandardScaler already standardizes the
    # inputs, and OLS predictions are invariant to that extra rescaling.
    model = LinearRegression()
    pipeline = Pipeline([
        ('scaler', StandardScaler()),
        ('model', model)
    ])
    # Empty grid: GridSearchCV is used only for its uniform CV/reporting API.
    param_grid = {}
    regressor = GridSearchCV(pipeline, param_grid, scoring='r2', cv=5)
    regressor.fit(x_train, y_train)
    print('best score', regressor.best_score_)
    print('best estimator', regressor.best_estimator_.named_steps['model'])
    # Cross-validated R2 on the training split, then held-out test metrics.
    scores = cross_val_score(regressor, x_train, y_train, cv=5)
    r2_mean_cv = np.mean(scores)
    r2_std_cv = np.std(scores)
    y_pred = regressor.predict(x_test)
    # FIX: pass y_test directly (the .to_list() conversion was inconsistent
    # with every other model cell and unnecessary).
    r2 = r2_score(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    collect_results(dfd[features].columns, regressor, method, r2_mean_cv, r2_std_cv, r2, mae, mse, skip_coef=True)
method = 'RIDGE'
if method in enabled_models:
    # Ridge regression. FIX: normalize=True was removed — the parameter was
    # deprecated in scikit-learn 1.0 and removed in 1.2, and the pipeline's
    # StandardScaler already standardizes the features. NOTE(review): the
    # effective regularization scale shifts slightly versus the old
    # normalize-based run; re-check the chosen alpha if scores move.
    model = Ridge(random_state=42)
    pipeline = Pipeline([
        ('scaler', StandardScaler()),
        ('model', model)
    ])
    # Single-point grid: GridSearchCV used for the uniform CV/reporting API.
    param_grid = {
        'model__alpha': [0.1],
        'model__tol': [0.001]
    }
    regressor = GridSearchCV(pipeline, param_grid, scoring='r2', cv=5)
    regressor.fit(x_train, y_train)
    print('best score', regressor.best_score_)
    print('best estimator', regressor.best_estimator_.named_steps['model'])
    scores = cross_val_score(regressor, x_train, y_train, cv=5)
    r2_mean_cv = np.mean(scores)
    r2_std_cv = np.std(scores)
    y_pred = regressor.predict(x_test)
    r2 = r2_score(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    collect_results(dfd[features].columns, regressor.best_estimator_, method, r2_mean_cv, r2_std_cv, r2, mae, mse, skip_coef=True)
method = 'LASSO'
if method in enabled_models:
    # Lasso regression. FIX: normalize=True was removed — the parameter was
    # deprecated in scikit-learn 1.0 and removed in 1.2, and the pipeline's
    # StandardScaler already standardizes the features. NOTE(review): the
    # effective regularization scale shifts slightly versus the old
    # normalize-based run; re-check the chosen alpha if scores move.
    model = Lasso(random_state=42)
    pipeline = Pipeline([
        ('scaler', StandardScaler()),
        ('model', model)
    ])
    param_grid = {
        'model__alpha': [0.001, 0.1],
        'model__tol': [0.001, 0.01],
        'model__max_iter': [100, 200]
    }
    regressor = GridSearchCV(pipeline, param_grid, scoring='r2', cv=5)
    regressor.fit(x_train, y_train)
    print('best score', regressor.best_score_)
    print('best estimator', regressor.best_estimator_.named_steps['model'])
    scores = cross_val_score(regressor, x_train, y_train, cv=5)
    r2_mean_cv = np.mean(scores)
    r2_std_cv = np.std(scores)
    y_pred = regressor.predict(x_test)
    r2 = r2_score(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    collect_results(dfd[features].columns, regressor.best_estimator_, method, r2_mean_cv, r2_std_cv, r2, mae, mse, skip_coef=True)
method = 'SGD'
if method in enabled_models:
    # Linear model fitted by stochastic gradient descent, tuned over the
    # `alphas` grid with both L1 and L2 penalties.
    model = SGDRegressor(random_state=42)
    pipeline = Pipeline([('scaler', StandardScaler()), ('model', model)])
    param_grid = {
        'model__alpha': alphas,
        'model__penalty': ['l1', 'l2'],
        'model__tol': [0.3],
        'model__max_iter': [500],
    }
    regressor = GridSearchCV(pipeline, param_grid, scoring='r2', cv=5)
    regressor.fit(x_train, y_train)
    print('best score', regressor.best_score_)
    print('best estimator', regressor.best_estimator_.named_steps['model'])
    # Cross-validated R2 on the training split, then held-out test metrics.
    scores = cross_val_score(regressor, x_train, y_train, cv=5)
    r2_mean_cv, r2_std_cv = np.mean(scores), np.std(scores)
    y_pred = regressor.predict(x_test)
    r2 = r2_score(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    collect_results(dfd[features].columns,
                    regressor.best_estimator_.named_steps['model'],
                    method, r2_mean_cv, r2_std_cv, r2, mae, mse,
                    skip_coef=True)
method = 'R.FOREST'
if method in enabled_models:
    # Random Forest — no scaler needed, tree models are scale-invariant.
    model = RandomForestRegressor(random_state=42)
    pipeline = Pipeline([('model', model)])
    param_grid = {
        #'model__max_depth':[15],
        'model__n_estimators': [200],
    }
    regressor = GridSearchCV(pipeline, param_grid, scoring='r2', cv=5)
    regressor.fit(x_train, y_train)
    print('best score', regressor.best_score_)
    print('best estimator', regressor.best_estimator_.named_steps['model'])
    # Cross-validated R2 on the training split, then held-out test metrics.
    scores = cross_val_score(regressor, x_train, y_train, cv=5)
    r2_mean_cv, r2_std_cv = np.mean(scores), np.std(scores)
    y_pred = regressor.predict(x_test)
    r2 = r2_score(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    collect_results(dfd[features].columns, model, method,
                    r2_mean_cv, r2_std_cv, r2, mae, mse, skip_coef=True)
    # Rank features by the forest's impurity-based importances.
    importances = regressor.best_estimator_.named_steps['model'].feature_importances_
    print_feature_importances(method, importances, dfd[features])
# Example 1: a mid-sized flat in Barrio del Pilar with these characteristics.
# Start from a real training row so every one-hot column exists, then override.
example = x_train.iloc[0].copy()
smodel = regressor.best_estimator_.named_steps['model']
example_1_values = {
    'latitude': 40.4762958,
    'longitude': -3.7045541,
    'accommodates': 4,
    'bathrooms': 1,
    'bedrooms': 2,
    'security_deposit': 35,
    'cleaning_fee': 50,
    'guests_included': 2,
    'extra_people': 15,
    'minimum_nights_avg_ntm': 1,
    'maximum_nights_avg_ntm': 60,
    'instant_bookable': 0,
    'has_wifi': 1,
    'has_essentials': 1,
    'has_kitchen': 1,
    'has_heating': 1,
    'has_washer': 1,
    'has_hangers': 1,
    'has_tv': 1,
    'has_hair_dryer': 1,
    'has_iron': 1,
    'has_shampoo': 1,
    'has_laptop_friendly_workspace': 1,
    'has_air_conditioning': 1,
    'has_hot_water': 1,
    'has_elevator': 1,
    'has_refrigerator': 1,
    'has_dishes_and_silverware': 1,
    'has_microwave': 1,
    'has_bed_linens': 1,
    'has_no_stairs_or_steps_to_enter': 1,
    'has_coffee_maker': 1,
    'has_cooking_basics': 1,
    'has_family/kid_friendly': 1,
    'has_long_term_stays_allowed': 0,
    'has_first_aid_kit': 0,
    'has_oven': 0,
    'has_stove': 0,
    'has_license': 1,
    'property_type_Apartment': 1,
    'property_type_Chalet': 0,
    'property_type_Condominium': 0,
    'property_type_House': 0,
    'property_type_Loft': 0,
    'room_type_Entire home/apt': 1,
    'room_type_Private room': 0,
    'room_type_Shared room': 0,
    'cancellation_policy_flexible': 0,
    'cancellation_policy_moderate': 0,
    'cancellation_policy_strict': 0,
    'cancellation_policy_strict_14_with_grace_period': 1,
    'cancellation_policy_super_strict_30': 0,
    'cancellation_policy_super_strict_60': 0,
}
for key, value in example_1_values.items():
    example[key] = value
example_prediction = smodel.predict([example])
print(example_prediction)
# Example 2: a small flat in La Latina with these characteristics.
# Reuses the `example` Series from example 1, overriding the same fields.
example_2_values = {
    'latitude': 40.4120087,
    'longitude': -3.7092935,
    'accommodates': 2,
    'bathrooms': 1,
    'bedrooms': 1,
    'security_deposit': 50,
    'cleaning_fee': 50,
    'guests_included': 2,
    'extra_people': 0,
    'minimum_nights_avg_ntm': 1,
    'maximum_nights_avg_ntm': 15,
    'instant_bookable': 1,
    'has_wifi': 1,
    'has_essentials': 1,
    'has_kitchen': 1,
    'has_heating': 1,
    'has_washer': 1,
    'has_hangers': 1,
    'has_tv': 0,
    'has_hair_dryer': 1,
    'has_iron': 1,
    'has_shampoo': 1,
    'has_laptop_friendly_workspace': 1,
    'has_air_conditioning': 1,
    'has_hot_water': 1,
    'has_elevator': 0,
    'has_refrigerator': 1,
    'has_dishes_and_silverware': 1,
    'has_microwave': 1,
    'has_bed_linens': 1,
    'has_no_stairs_or_steps_to_enter': 1,
    'has_coffee_maker': 1,
    'has_cooking_basics': 1,
    'has_family/kid_friendly': 1,
    'has_long_term_stays_allowed': 0,
    'has_first_aid_kit': 0,
    'has_oven': 0,
    'has_stove': 0,
    'has_license': 0,
    'property_type_Apartment': 1,
    'property_type_Chalet': 0,
    'property_type_Condominium': 0,
    'property_type_House': 0,
    'property_type_Loft': 0,
    'room_type_Entire home/apt': 1,
    'room_type_Private room': 0,
    'room_type_Shared room': 0,
    'cancellation_policy_flexible': 0,
    'cancellation_policy_moderate': 0,
    'cancellation_policy_strict': 0,
    'cancellation_policy_strict_14_with_grace_period': 0,
    'cancellation_policy_super_strict_30': 1,
    'cancellation_policy_super_strict_60': 0,
}
for key, value in example_2_values.items():
    example[key] = value
example_prediction = smodel.predict([example])
print(example_prediction)
# SHAP explanation of the tuned tree model currently held by `regressor`
# (the Random Forest from the R.FOREST cell above).
smodel = regressor.best_estimator_.named_steps['model']
explainer = shap.TreeExplainer(smodel)
shap.initjs()
# NOTE(review): SHAP values are computed over the FULL dataset, not just the
# test split — fine for explanation, but can be slow on large data.
shap_values = explainer.shap_values(dfd[features])
# summarize the effects of all the features
shap.summary_plot(shap_values, dfd[features], plot_type="bar")
# summarize the effects of all the features
shap.summary_plot(shap_values, dfd[features])
# Per-feature dependence plots for the main price drivers; the two location
# plots are repeated with latitude/longitude as the interaction feature.
shap.dependence_plot('guests_included', shap_values, dfd[features], interaction_index=None)
shap.dependence_plot('extra_people', shap_values, dfd[features], interaction_index=None)
shap.dependence_plot('accommodates', shap_values, dfd[features], interaction_index=None)
shap.dependence_plot('cleaning_fee', shap_values, dfd[features], interaction_index=None)
shap.dependence_plot('longitude', shap_values, dfd[features], interaction_index=None)
shap.dependence_plot('longitude', shap_values, dfd[features], interaction_index='latitude')
shap.dependence_plot('latitude', shap_values, dfd[features], interaction_index=None)
shap.dependence_plot('latitude', shap_values, dfd[features], interaction_index='longitude')
# visualize the 4000th prediction's explanation
i = 4000
print(dfd.iloc[i,:])
shap.force_plot(explainer.expected_value, shap_values[i,:], dfd[features].iloc[i,:])
# visualize the 8000th prediction's explanation
i = 8000
print(dfd.iloc[i,:])
shap.force_plot(explainer.expected_value, shap_values[i,:], dfd[features].iloc[i,:])
# visualize the 12000th prediction's explanation
i = 12000
print(dfd.iloc[i,:])
shap.force_plot(explainer.expected_value, shap_values[i,:], dfd[features].iloc[i,:])
method = 'ADABOOST'
if method in enabled_models:
    # AdaBoost with decision-tree base learners; note cv=2 (not 5) here,
    # matching the original notebook's faster configuration for this model.
    model = AdaBoostRegressor(random_state=42)
    pipeline = Pipeline([('model', model)])
    param_grid = {
        'model__n_estimators': [175],
        'model__learning_rate': [0.01],
    }
    regressor = GridSearchCV(pipeline, param_grid, scoring='r2', cv=2)
    regressor.fit(x_train, y_train)
    print('best score', regressor.best_score_)
    print('best estimator', regressor.best_estimator_.named_steps['model'])
    scores = cross_val_score(regressor, x_train, y_train, cv=2)
    r2_mean_cv, r2_std_cv = np.mean(scores), np.std(scores)
    y_pred = regressor.predict(x_test)
    r2 = r2_score(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    collect_results(dfd[features].columns, model, method,
                    r2_mean_cv, r2_std_cv, r2, mae, mse, skip_coef=True)
    # Rank features by the ensemble's impurity-based importances.
    importances = regressor.best_estimator_.named_steps['model'].feature_importances_
    print_feature_importances(method, importances, dfd[features])
method = 'XGBOOST'
if method in enabled_models:
    model = XGBRegressor(random_state=42)
    pipeline = Pipeline([('model', model)])
    # BUG FIX: the original grid used CatBoost-style parameter names
    # ('model__depth', 'model__iterations'), which XGBRegressor does not
    # recognize, so the intended settings were never applied. XGBoost's
    # equivalents are max_depth and n_estimators.
    param_grid = {
        'model__max_depth': [10],
        'model__n_estimators': [150],
        'model__learning_rate': [0.2]
    }
    regressor = GridSearchCV(pipeline, param_grid, scoring='r2', cv=5)
    regressor.fit(x_train, y_train)
    print('best score', regressor.best_score_)
    print('best estimator', regressor.best_estimator_.named_steps['model'])
    scores = cross_val_score(regressor, x_train, y_train, cv=5)
    r2_mean_cv = np.mean(scores)
    r2_std_cv = np.std(scores)
    y_pred = regressor.predict(x_test)
    r2 = r2_score(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    collect_results(dfd[features].columns, model, method, r2_mean_cv, r2_std_cv, r2, mae, mse, skip_coef=True)
    # Rank features by XGBoost's gain-based importances.
    importances = regressor.best_estimator_.named_steps['model'].feature_importances_
    print_feature_importances(method, importances, dfd[features])
method = 'CATBOOST'
if method in enabled_models:
    # Gradient boosting with CatBoost; verbose=0 suppresses per-iteration logs.
    model = cb.CatBoostRegressor(verbose=0, random_seed=42)
    pipeline = Pipeline([('model', model)])
    param_grid = {
        'model__depth': [10],
        'model__iterations': [150],
        'model__learning_rate': [0.1],
    }
    regressor = GridSearchCV(pipeline, param_grid, scoring='r2', cv=5)
    regressor.fit(x_train, y_train)
    print('best score', regressor.best_score_)
    # CatBoost's repr is uninformative, so print the parameter dict instead.
    print('best estimator', regressor.best_estimator_.named_steps['model'].get_params())
    scores = cross_val_score(regressor, x_train, y_train, cv=5)
    r2_mean_cv, r2_std_cv = np.mean(scores), np.std(scores)
    y_pred = regressor.predict(x_test)
    r2 = r2_score(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    collect_results(dfd[features].columns, model, method,
                    r2_mean_cv, r2_std_cv, r2, mae, mse, skip_coef=True)
    # Rank features by CatBoost's importances.
    importances = regressor.best_estimator_.named_steps['model'].feature_importances_
    print_feature_importances(method, importances, dfd[features])
# Conclusion — selected base model: Random Forest